Import packages


In [1]:
%matplotlib inline
from __future__ import print_function

import io
import time, datetime
import numpy as np
import pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Input, Embedding, Dense, GRU, Dropout, Reshape, Merge, Bidirectional
from keras.callbacks import Callback, ModelCheckpoint

from sklearn.manifold import TSNE


Using TensorFlow backend.

Download data files


In [2]:
! wget -q -O emoji_joined.txt https://raw.githubusercontent.com/uclmr/emoji2vec/master/data/raw_training_data/emoji_joined.txt
! wget -q http://nlp.stanford.edu/data/glove.6B.zip
! unzip -q -o glove.6B.zip
! rm -f glove.6B.zip glove.6B.50d.txt glove.6B.100d.txt glove.6B.200d.txt

Initialize global variables


In [3]:
# --- Model dimensions and vocabulary limits ---------------------------------
EMBEDDING_DIM = 300          # width of both the word and the emoji vectors
MAX_SEQUENCE_LENGTH = 15     # descriptions are padded/truncated to this many tokens
MAX_NB_WORDS = 5000          # upper bound on the description vocabulary
MAX_NB_EMOJIS = 2000         # upper bound on the number of emoji rows

# --- Input files --------------------------------------------------------------
# The GloVe file name encodes the vector width, so derive it from EMBEDDING_DIM.
GLOVE_FILE = 'glove.6B.%dd.txt' % EMBEDDING_DIM
EMOJI_DESCRIPTIONS_FILE = 'emoji_joined.txt'

# --- Output files -------------------------------------------------------------
MODEL_WEIGHTS_FILE = 'weights.h5'
EMOJI_EMBEDDINGS_FILE = 'emoji_embeddings.txt'
EMOJI_EMB_VIZ_FILE = 'emoji_emb_viz.csv'

# --- Reproducibility and evaluation -------------------------------------------
# Two independent seeds: one for building the negative pairs, one for the
# final shuffle of the combined data set.
RNG_SEED_1, RNG_SEED_2 = 1446557, 1337603
VALIDATION_SPLIT = 0.1       # fraction of the data held out during fit()

Load emojis


In [4]:
# The corpus is tab-separated text; column names are supplied here rather than
# read from the file, giving one (description, emoji) pair per row.
read_options = dict(sep='\t',
                    engine='python',
                    encoding='utf_8',
                    names=['description', 'emoji'])
emoji_descriptions = pd.read_csv(EMOJI_DESCRIPTIONS_FILE, **read_options)

print('Emoji descriptions: %d' % emoji_descriptions.shape[0])


Emoji descriptions: 6088

In [5]:
# Preview the first rows of the raw pairs (head() returns five rows by default).
emoji_descriptions.head()


Out[5]:
description emoji
0 ballot box with check ☑️
1 full moon with face 🌝
2 cheese 🌝
3 moon 🌝
4 smiling moon 🌝

In [6]:
# Negative examples: every emoji keeps its position, but is paired with a
# description taken from a seeded shuffle of the whole description column.
shuffled_descriptions = emoji_descriptions['description'].sample(frac=1, random_state=RNG_SEED_1)
neg_emoji_descriptions = pd.DataFrame({'emoji': emoji_descriptions['emoji'].values,
                                       'description': shuffled_descriptions.values})

In [7]:
# Preview the shuffled (negative) pairs; head() returns five rows by default.
neg_emoji_descriptions.head()


Out[7]:
description emoji
0 flag for jordan ☑️
1 middle school 🌝
2 menorah 🌝
3 old man 🌝
4 jazz 🌝

In [8]:
# Tag genuine pairs with 1 and shuffled pairs with 0 (columns are added in
# place), then stack both tables and shuffle the rows with the second seed.
for frame, target in ((emoji_descriptions, 1), (neg_emoji_descriptions, 0)):
    frame['label'] = target
labelled_pairs = pd.concat([emoji_descriptions, neg_emoji_descriptions])
emoji_data = labelled_pairs.sample(frac=1, random_state=RNG_SEED_2)

In [9]:
# Show the first ten rows of the shuffled, labelled data set.
emoji_data.iloc[:10]


Out[9]:
description emoji label
4078 arab ▫️ 0
468 black man with turban 🈵 0
2407 sad cat 😿 1
1530 flag for brazil 🇧🇷 1
4797 accessible bathroom 💴 0
5890 white left pointing backhand index 🍳 0
5401 blond 👱 1
2520 face with open mouth and cold sweat 😰 1
747 slow 🐢 1
5770 mobile phone 📱 1

In [10]:
# Collapse the table to one row per emoji by joining all of its descriptions
# into a single comma-separated string.
descriptions_by_emoji = emoji_descriptions.groupby('emoji')['description']
emoji_series = descriptions_by_emoji.apply(', '.join)
emojis_combined_desc = pd.DataFrame({'emoji': emoji_series.index,
                                     'description': emoji_series.values})

print('Emojis: %d' % emojis_combined_desc.shape[0])


Emojis: 1661

In [11]:
# Spot check: list the emojis whose combined description mentions "new york".
mentions_new_york = emojis_combined_desc['description'].str.contains('new york')
emojis_combined_desc[mentions_new_york]


Out[11]:
description emoji
606 slice of pizza, pie, italy, pepperoni pizza, s... 🍕
1362 statue of liberty, new york 🗽
1506 taxicab, city, new york taxi, car, service, au... 🚕

Build emoji index


In [12]:
emojis = emojis_combined_desc['emoji'].values

# Ids are assigned in table order starting at 1, so id 0 is never handed out.
emoji_ids = range(1, len(emojis) + 1)
emoji_index = dict(zip(emojis, emoji_ids))           # emoji -> id
emoji_reverse_index = dict(zip(emoji_ids, emojis))   # id -> emoji

print("Emojis in index: %d" % len(emoji_index))


Emojis in index: 1661

Build word index


In [13]:
# Fit the tokenizer on every description in the labelled data set and convert
# the same texts into sequences of word ids.
descriptions = emoji_data['description'].values
description_texts = descriptions.tolist()

tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(description_texts)
desc_word_sequences = tokenizer.texts_to_sequences(description_texts)
word_index = tokenizer.word_index

print("Words in index: %d" % len(word_index))


Words in index: 3364

Load GloVe word embeddings


In [14]:
# Parse the GloVe text file into a {word: vector} dictionary. Every line holds
# a token followed by its whitespace-separated vector components.
#
# The file is decoded explicitly as UTF-8: the built-in open() would fall back
# to the platform's default encoding, which fails on the non-ASCII tokens in
# the file when that default is not UTF-8. io.open is used because it accepts
# `encoding` on both Python 2 and Python 3.
embeddings_index = {}
with io.open(GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings: %d' % len(embeddings_index))


Word embeddings: 400000

Prepare word embedding matrix


In [15]:
nb_words = min(MAX_NB_WORDS, len(word_index))

# One row per word id plus an extra row 0, which is never written here
# (presumably because tokenizer ids start at 1 -- confirm against Keras docs).
# Rows of words that GloVe does not know are left as all-zeros.
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))
for token, token_id in word_index.items():
    if token_id <= MAX_NB_WORDS and token in embeddings_index:
        word_embedding_matrix[token_id] = embeddings_index[token]

# Count the rows whose components sum to zero.
null_rows = word_embedding_matrix.sum(axis=1) == 0
print('Null word embeddings: %d' % null_rows.sum())


Null word embeddings: 72

Prepare training data


In [16]:
# Emoji input: the integer id of the emoji in every (emoji, description) row.
e_data = np.array([emoji_index[emoji] for emoji in emoji_data['emoji'].values])

# Description input: word-id sequences padded/truncated to a fixed length.
d_data = pad_sequences(desc_word_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Two-column targets: genuine pairs become [1, 0], shuffled pairs [0, 1].
is_negative = emoji_data['label'].values == 0
labels = np.column_stack([~is_negative, is_negative]).astype(int)

nb_emojis = min(MAX_NB_EMOJIS, len(emoji_index))

for caption, value in (('Shape of emoji data tensor:', e_data.shape),
                       ('Shape of description data tensor:', d_data.shape),
                       ('Shape of label tensor:', labels.shape),
                       ('Number of emojis:', nb_emojis)):
    print(caption, value)


Shape of emoji data tensor: (12176,)
Shape of description data tensor: (12176, 15)
Shape of label tensor: (12176, 2)
Number of emojis: 1661

Define model


In [17]:
# Emoji branch: a trainable embedding looked up by emoji id, reshaped from
# (1, EMBEDDING_DIM) to a flat EMBEDDING_DIM vector.
P = Sequential([
    Embedding(nb_emojis + 1, EMBEDDING_DIM, input_length=1),
    Reshape((EMBEDDING_DIM,)),
])

# Description branch: frozen GloVe word vectors fed through a bidirectional GRU
# whose forward and backward outputs are summed.
Q = Sequential([
    Embedding(nb_words + 1,
              EMBEDDING_DIM,
              weights=[word_embedding_matrix],
              input_length=MAX_SEQUENCE_LENGTH,
              trainable=False),
    Bidirectional(GRU(EMBEDDING_DIM, dropout_W=0.5, dropout_U=0.5), merge_mode='sum'),
])

# Classifier: concatenate both branches and predict the two-way label.
model = Sequential([
    Merge([P, Q], mode='concat'),
    Dropout(0.5),
    Dense(EMBEDDING_DIM*2, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['categorical_accuracy'])

Train model


In [18]:
# Keep only the weights of the epoch with the best validation accuracy.
checkpoint = ModelCheckpoint(MODEL_WEIGHTS_FILE,
                             monitor='val_categorical_accuracy',
                             save_best_only=True)
callbacks = [checkpoint]
fit_options = dict(nb_epoch=80,
                   validation_split=VALIDATION_SPLIT,
                   verbose=0,
                   callbacks=callbacks)

print("Starting training at", datetime.datetime.now())

train_started = time.time()
history = model.fit([e_data, d_data], labels, **fit_options)
train_finished = time.time()

print("Training ended at", datetime.datetime.now())

elapsed_minutes = (train_finished - train_started) / 60.
print("Minutes elapsed: %f" % elapsed_minutes)


Starting training at 2016-12-02 21:10:21.672063
Training ended at 2016-12-02 21:31:24.765012
Minutes elapsed: 21.051547

Plot accuracy


In [19]:
# Training vs. validation accuracy per epoch (epochs are shown 1-based).
acc = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
                    'training': history.history['categorical_accuracy'],
                    'validation': history.history['val_categorical_accuracy']})
# figsize must be an ordered (width, height) pair. The previous set literal
# {7,10} had no defined order; CPython iterates it as 10, 7, so (10, 7) keeps
# the figure shape that was actually rendered. The deprecated `.ix[:,:]`
# indexer selected the whole frame and is dropped.
ax = acc.plot(x='epoch', figsize=(10, 7), grid=True)
ax.set_ylabel("categorical accuracy")
ax.set_ylim([0.0,1.0]);


Plot loss


In [20]:
# Training vs. validation loss per epoch (epochs are shown 1-based).
loss = pd.DataFrame({'epoch': [ i + 1 for i in history.epoch ],
                     'training': history.history['loss'],
                     'validation': history.history['val_loss']})
# figsize must be an ordered (width, height) pair. The previous set literal
# {7,10} had no defined order; CPython iterates it as 10, 7, so (10, 7) keeps
# the figure shape that was actually rendered. The deprecated `.ix[:,:]`
# indexer selected the whole frame and is dropped.
ax = loss.plot(x='epoch', figsize=(10, 7), grid=True)
ax.set_ylabel("loss")
ax.set_ylim([0.0,2.0]);


Extract emoji embeddings from the best model checkpoint and save them to a file


In [21]:
# Restore the checkpointed weights, then read the emoji embedding matrix from
# the first layer of the emoji branch.
model.load_weights(MODEL_WEIGHTS_FILE)
weights = P.layers[0].get_weights()[0]

# Row 0 belongs to the unused id 0 (emoji ids start at 1), so it is skipped;
# the remaining rows line up with emojis_combined_desc by position.
embeddings = pd.DataFrame(weights[1:])
embeddings = pd.concat([emojis_combined_desc['emoji'], embeddings], axis=1)

# One line per emoji: "<emoji> <v1> ... <vN>". The encoding is stated
# explicitly because the first column is non-ASCII text and the input file is
# read as UTF-8 as well.
embeddings.to_csv(EMOJI_EMBEDDINGS_FILE, sep=' ', header=False, index=False, encoding='utf-8')

Plot t-SNE visualization and save it to a file


In [22]:
# Project the emoji embeddings to 2-D with t-SNE.
tsne2 = TSNE(n_components=2, perplexity=30, init='pca', n_iter=5000)
# Row 0 of `weights` belongs to the unused id 0 and is never trained, so it is
# left out of the fit instead of being projected and then thrown away.
fit = tsne2.fit_transform(weights[1:])
visualization = pd.DataFrame(fit, columns=['x', 'y'])
visualization['emoji'] = emojis_combined_desc['emoji'].values
# figsize must be an ordered (width, height) pair; the previous set literal
# {7,10} had no defined order (CPython iterates it as 10, 7).
visualization.plot('x', 'y', kind='scatter', figsize=(10, 7), grid=True);

# The emoji column is non-ASCII text, so the encoding is stated explicitly.
visualization.to_csv(EMOJI_EMB_VIZ_FILE, encoding='utf-8')



In [ ]: